# -*- coding: utf-8 -*-
# time: 2025/4/23 13:47
# file: qwen.py
# author: hanson

# Two sample records (a translate-to-English task and a summarisation task);
# each record carries "instruction", "input" and "output" fields.
datasets = [
    {
        "instruction": "翻译成英文",
        "input": "今天天气很好",
        "output": "The weather is nice today",
    },
    {
        "instruction": "总结文章",
        "input": "长篇新闻内容...",
        "output": "摘要文本...",
    },
]
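# Format data: "qwen" format.
# NOTE(review): the template below is a plain Instruction/Input/Output prompt,
# not a ChatML-style chat template - confirm that this label is intended.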
def format_fn(x):
    """Render one record as a plain-text Instruction/Input/Output prompt.

    Args:
        x: Mapping that provides the ``instruction``, ``input`` and
            ``output`` keys.

    Returns:
        A dict with a single ``text`` key holding the three labelled
        lines joined by newlines.

    Note:
        A later definition in this file reuses the name ``format_fn``;
        once that one runs, this version is no longer reachable by name.
    """
    labelled_lines = (
        f"Instruction: {x['instruction']}",
        f"Input: {x['input']}",
        f"Output: {x['output']}",
    )
    return {"text": "\n".join(labelled_lines)}

# Show the first sample rendered by the format_fn defined directly above.
# This must run before format_fn is redefined further down the file.
print(format_fn(datasets[0]))
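# Format data: "deepseek" format.
# NOTE(review): the template below uses <|im_start|>/<|im_end|> markers
# (ChatML, as used by Qwen chat models) - confirm that this label is intended.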
def format_fn(x):
    """Render one record as a two-turn chat transcript.

    The user turn holds ``"<instruction>: <input>"`` and the assistant turn
    holds the output; each turn is wrapped in ``<|im_start|>`` /
    ``<|im_end|>`` markers.

    Args:
        x: Mapping that provides the ``instruction``, ``input`` and
            ``output`` keys.

    Returns:
        A dict with a single ``text`` key holding both turns separated by
        a newline.

    Note:
        This rebinds ``format_fn``, replacing the earlier definition with
        the same name in this file.
    """
    user_turn = f"<|im_start|>user\n{x['instruction']}: {x['input']}<|im_end|>"
    assistant_turn = f"<|im_start|>assistant\n{x['output']}<|im_end|>"
    return {"text": f"{user_turn}\n{assistant_turn}"}

# Show the first sample rendered by the format_fn redefined directly above
# (the most recent definition is the one bound at this point).
print(format_fn(datasets[0]))